Estimating Directional Route Mileage by UZA

Author

Steven Andrews, Boston Region MPO

Published

October 3, 2024

library(sf)
library(readxl)
library(tidyverse)
library(mapview)
library(gtfstools)
library(stringr)
library(gt)
mapviewOptions(fgb = FALSE)

UZA Work

We write a function to grab the files from the Census Bureau site and perform light filtering and manipulation.

sf_from_zip <- function(URL) {
  # based on:
  # https://stackoverflow.com/questions/59740419/unzipping-and-reading-shape-file-in-r-without-rgdal-installed
  
  # Load local temp files
  temp_0 <- tempfile()
  temp_1 <- tempfile()
  
  # download the zip, put it in the first temp directory
  download.file(URL, temp_0)
  
  # Unzip and place in temp_1
  unzip(zipfile = temp_0, exdir = temp_1)
  
  # Read the shapefile.
  geom_sf <- sf::read_sf(temp_1)
  
  out <- geom_sf
}
# Record the URLs.
UZA20_URL <- "https://www2.census.gov/geo/tiger/TIGER2020/UAC/tl_2020_us_uac20.zip"

# Obtain the data.
UZA20 <- sf_from_zip(UZA20_URL)

# Read data from population file:
# https://www.transit.dot.gov/ntd/2020-census-changes-uzapopulation
UZA20_pop <- read_xlsx("../data/base/UZA_CHANGES_1990-2020_2_2.xlsx", 
                               sheet = "UZA 2020 Master",
                              col_types = "text") |> 
  janitor::clean_names() |> 
  mutate(x2020_uace = str_pad(x2020_uace, side = "left", pad = "0", 5))

# Transform to state coordinate system.
UZA20 <- UZA20 |> sf::st_transform(26986)

NE <- c("MA", "CT", "RI", "VT", "NH", "ME")
NE_collapse <- paste0(NE,collapse = "|")

# There are only Urban Areas in 2020.
UZA20_NE <- UZA20 |>
  filter(str_detect(NAME20, NE_collapse)) |> 
  left_join(UZA20_pop |> select(x2020_uace, x2020_population), by = c("GEOID20" = "x2020_uace")) |> 
    mutate(UZA_type = if_else(!is.na(x2020_population), 
                            "Pop: GTE 50k", 
                            "Pop: LT 50k"))

saveRDS(UZA20_NE, "../data/processed/UZA20_NE.rds")

CR Line Work

We start by downloading the last Summer 2023 GTFS file from the MBTA’s GTFS archive. This file contains information for the CapeFlyer and has the recent Foxboro Branch. We remove the segments running from the Providence Line to Foxboro as they are not typical service.

# This is the last Summer 2022 GTFS schedule. 
# We chose this because it contains the CapeFlyer if we need it. 
gtfs_summ23 <- gtfstools::read_gtfs("https://cdn.mbtace.com/archive/20230810.zip")

# Convert shapes to MA system.
gtfs_summ23_shp <- convert_shapes_to_sf(gtfs_summ23) |> 
  st_transform(26986)

# Attach them to our route_ids so we can have hints about what each shape is.
gtfs_summ23_shp <-
  inner_join(gtfs_summ23_shp,
             gtfs_summ23$trips |> distinct(route_id, shape_id),
             by = "shape_id")

gtfs_summ23_stp <- gtfs_summ23 |> convert_stops_to_sf() 
gtfs_summ23_stp <- gtfs_summ23_stp |> filter(vehicle_type == 2) |> 
  distinct(stop_name, .keep_all = TRUE)

We need to filter the shapes for commuter rail lines. We can view the shapes to confirm that this matches the system as we know it.

gtfs_summ23_shp_filt <- gtfs_summ23_shp |> 
  filter(str_detect(route_id, "CR-") | str_detect(route_id, "Cape")) |> filter(!shape_id %in% c("canonical-SouFoxProv", "canonical-ProvidenceToSouthStationViaFoxboro", "ProvidenceToSouthStationViaFoxboro")) # remove fox from; south not regular service

# Filter out the capeflyer
gtfs_summ23_shp_filt <- gtfs_summ23_shp_filt |> 
  filter(!str_detect(route_id, "Cape"))

gtfs_summ23_shp_filt |> mapview(zcol = "route_id", color = viridis::turbo)

The shapes representing some of the same tracks are very close but not exactly the same. If not addressed, this would cause an inflation in directional route miles. We can buffer the routes slightly to coalesce similar track segments.

Note

Zoom into a station like Readville to see the effects of the buffer on the final result. You’ll notice a “cupping” where we would really want something resembling an “X” through the junction.

# Buffer out by 20m, then in by -19 meters -1*(20-1). A visual inspection suggested that 20m was enough distance to capture nearby alignments. 

d <- 20

gtfs_summ23_shp_filt_buff <- gtfs_summ23_shp_filt |> 
  st_buffer(d, endCapStyle = "SQUARE", joinStyle = "ROUND") |>
  summarize() |> 
  st_buffer(-1 * (d-1), endCapStyle = "SQUARE", joinStyle = "ROUND")

mapview(gtfs_summ23_shp_filt_buff) + 
  mapview(gtfs_summ23_shp_filt, 
          zcol = "route_id", 
          color = viridis::turbo)

We take the outside line of the buffer which gives us an approximation of the length of the routes. In this case, because we are concerned with directional route miles, each line can be thought of as one direction of the route, meaning we do not have to divide by two to get the length of the centerline.

gtfs_summ23_shp_line <- gtfs_summ23_shp_filt_buff |> st_cast(to = "MULTILINESTRING")
saveRDS(gtfs_summ23_shp_line,
        "../data/processed/gtfs_summ23_shp_line.rds")

Perform Intersections

Read in the pre-generated datasets.

# Read in trains
trains_pass <- readRDS("../data/processed/gtfs_summ23_shp_line.rds")

# Read in UZA
UZA20_NE <- readRDS("../data/processed/UZA20_NE.rds")

# Create labels for helpful hovering.
labs20 <- UZA20_NE$NAME20

We perform an intersection over each of the datasets.

trains_pass_UZA20 <- st_intersection(trains_pass, UZA20_NE)

We map the datasets, showing only UZAs in MA.

Aggregate by UZA

We want to aggregate the data by UZA. We do not need to divide the length of the buffer by two because we are using the outside of the buffer, which is essentially already a doubled centerline.

# Find the total mileage of the CR system. (In meters!)
trains_pass$len <- st_length(trains_pass) |> 
  units::drop_units()

# Aggregate for the total.
trains_pass_len_tot <- sum(trains_pass$len)

# Calculate the length then drop the units for 2010 and 2020.  

trains_pass_UZA20$len <- st_length(trains_pass_UZA20) |> 
  units::drop_units()

# Summarize 2020 by Geometry ---------------
trains_pass_UZA20_summ <- trains_pass_UZA20 |> 
  group_by(NAME20, UZA_type) |> 
  summarize(len_tot = sum(len)) |> 
  st_drop_geometry() |> 
  ungroup()

# Find length in UZAs
UZA20_len <- sum(trains_pass_UZA20_summ$len_tot)

# Subtract and attach the non-UZA length. Make a fresh row to append.
trains_pass_UZA20_summ <- bind_rows(
  trains_pass_UZA20_summ,
  tibble(NAME20 = "NonUZA",
         UZA_type = "NonUZA",
         len_tot = trains_pass_len_tot - UZA20_len))

trains_pass_UZA20_summ <- trains_pass_UZA20_summ |>
  mutate(pct = len_tot/sum(len_tot))

# Check Delta
sum(trains_pass_UZA20_summ$len_tot) - trains_pass_len_tot
[1] 0

Label and convert the units then export the data to CSVs.

# Label Units.
trains_pass_UZA20_summ <- trains_pass_UZA20_summ |> 
  rename(len_tot_m = len_tot) |> 
  mutate(len_tot_mi = measurements::conv_unit(len_tot_m, "m", "mi"),
         len_tot_mi = round(len_tot_mi, 3)) |> 
  select(NAME20, UZA_type, len_tot_mi, pct)

write_csv(trains_pass_UZA20_summ |> arrange(desc(len_tot_mi)), "../output/trains_pass_gtfs_UZA20_summ24.csv")
NAME20 UZA_type len_tot_mi pct
Boston, MA--NH Pop: GTE 50k 581.97 74.64%
Providence, RI--MA Pop: GTE 50k 85.52 10.97%
NonUZA NonUZA 58.56 7.51%
Worcester, MA--CT Pop: GTE 50k 26.55 3.41%
Leominster--Fitchburg, MA Pop: GTE 50k 22.91 2.94%
Ipswich, MA Pop: LT 50k 4.24 0.54%
Total - 779.75 100.00%